rm(list = ls())
library(data.table)
library(plyr)
library(tidyr)
library(lfe)
library(stargazer)
library(xtable)
library(sandwich)
library(roll)
library(readxl)
library(readr)
library(zoo)
library(texreg)
library(DescTools)
library(ggplot2)

# Read the master spreadsheet (its first row is skipped) into a data.table.
m <- data.table(
    read_xlsx("../Data/MasterData.xlsx", skip = 1, guess_max = 1e4)
)

# Basic respondent indicators, added by reference in a single call:
#   age     - 2021 minus the `year` column
#   matched - 1 when `experience` is present, 0 otherwise
#   female  - 1 when the gender answer starts with "B", 0 otherwise
#             (NA when gender is missing)
m[, `:=`(
    age = 2021 - year,
    matched = as.numeric(!is.na(experience)),
    female = as.numeric(substr(gender, 1, 1) == "B")
)]

# Ordinal education rank keyed by the first letter of the answer;
# options A, B and C share the lowest rank.
edu_rank <- c(A = 1, B = 1, C = 1, D = 2, E = 3, F = 4, G = 5)
m[substr(education, 1, 1) %in% names(edu_rank),
  education_order := unname(edu_rank[substr(education, 1, 1)])]

# College indicator: rank 4 or above (options F and G). Rows without a
# recognised education answer are left as NA.
m[!is.na(education_order), college := as.numeric(education_order >= 4)]

# Wealth bracket rank: options A..H map to 1..8 via the first letter of the answer.
wealth_codes <- LETTERS[1:8]
m[substr(total_wealth, 1, 1) %in% wealth_codes,
  wealth_order := as.numeric(match(substr(total_wealth, 1, 1), wealth_codes))]

# High-wealth indicator: bracket 6 or above (options F, G, H).
# Rows without a recognised wealth answer are left as NA.
m[!is.na(wealth_order), wealth_1m := as.numeric(wealth_order >= 6)]

# Income bracket rank: options A..H map to 1..8 via the first letter of the answer.
income_codes <- LETTERS[1:8]
m[substr(total_income, 1, 1) %in% income_codes,
  income_order := as.numeric(match(substr(total_income, 1, 1), income_codes))]

# High-income indicator: bracket 4 or above (options D through H).
# Rows without a recognised income answer are left as NA.
m[!is.na(income_order), income_200k := as.numeric(income_order >= 4)]

# Turn a lettered survey answer into a 0/1 indicator using its first character.
#
#   answer  - vector of responses; the first character is the option letter
#   high    - option letters coded as 1
#   exclude - option letters left as NA (treated as non-informative)
#
# Returns a numeric vector: 1 for `high` options, NA for `exclude` options and
# for missing answers, 0 for every other non-missing answer.
#
# The exclusion test looks at the first character, exactly like the `high`
# test. Comparing the whole string to "G" would only exclude answers that
# are the bare letter, so a labelled option (letter followed by text) would
# be coded 0 instead of NA. For bare-letter answers the result is unchanged.
# NOTE(review): the label format of these columns is not visible here - confirm.
first_letter_dummy <- function(answer, high, exclude) {
    key <- substr(answer, 1, 1)
    out <- rep(NA_real_, length(key))
    out[key %in% high] <- 1
    out[!is.na(key) & !(key %in% c(high, exclude))] <- 0
    out
}

# Options A/B count as "often"; option G is excluded.
m[, dummy_newscheck_often := first_letter_dummy(newscheck_freq, c("A", "B"), "G")]
m[, dummy_accountcheck_often := first_letter_dummy(accountcheck_freq, c("A", "B"), "G")]
m[, dummy_discussion_often := first_letter_dummy(discussion_freq, c("A", "B"), "G")]

# Options C/D/E count as "many groups"; options F and G are excluded.
m[, dummy_num_wechat := first_letter_dummy(num_wechat, c("C", "D", "E"), c("F", "G"))]


# Variables compared between matched and unmatched respondents, in table row order.
varlist <- c("age", "female", "college", "wealth_1m", "income_200k",
    "dummy_newscheck_often", "dummy_accountcheck_often", "dummy_discussion_often", "dummy_num_wechat",
    "agree", "consc", "extrv", "neuro", "opene")
stopifnot(all(c(varlist, "matched") %in% names(m)))

# Compare one variable between matched (matched == 1) and unmatched
# (matched == 0) rows of `dt`, ignoring rows where the variable is NA.
#
#   dt - data.table holding the variable and a 0/1 `matched` column
#   v  - name of the variable to compare (length-one character)
#
# Returns a one-row data.table with the variable name, both group means,
# their difference (matched minus unmatched) and a two-sided p-value.
#
# Groups are selected by the value of `matched`, not by row position in a
# grouped result: `by =` returns groups in order of first appearance, so the
# positions of the two groups depend on the data and can differ between
# variables once NA rows are dropped.
compare_by_match <- function(dt, v) {
    keep <- !is.na(dt[[v]])
    x <- dt[[v]][keep]
    grp <- dt[["matched"]][keep]

    x1 <- x[grp == 1]
    x0 <- x[grp == 0]
    n1 <- length(x1)
    n0 <- length(x0)

    gap <- mean(x1) - mean(x0)
    # Unequal-variance standard error; degrees of freedom n0 + n1 - 2 are
    # kept as in the original calculation.
    tstat <- gap / sqrt(var(x0) / n0 + var(x1) / n1)
    pval <- 2 * pt(abs(tstat), df = n0 + n1 - 2, lower.tail = FALSE)

    data.table(var = v, merged = mean(x1), non_merged = mean(x0), diff = gap, pval = pval)
}

# One row per variable, bound once rather than grown inside a loop. No
# scratch column is written to `m`.
summ <- rbindlist(lapply(varlist, function(v) compare_by_match(m, v)))

# Significance stars for the difference column: one star for p < 0.10, two
# for p < 0.05, three for p < 0.01. A missing p-value gets no stars.
star_count <- (summ$pval < 0.01) + (summ$pval < 0.05) + (summ$pval < 0.10)
star_count[is.na(star_count)] <- 0

# Format the numeric columns as two-decimal strings for the LaTeX table.
summ[, merged := sprintf("%.2f", merged)]
summ[, non_merged := sprintf("%.2f", non_merged)]
summ[, diff := paste0(sprintf("%.2f", diff), strrep("*", star_count))]

# Display labels keyed by variable name, so a reordered variable list cannot
# silently attach a label to the wrong row. The labels contain LaTeX math
# markup and are printed unsanitized below.
row_labels <- c(
    age = "Age",
    female = "Female",
    college = "College",
    wealth_1m = "Wealth$>$1M",
    income_200k = "Income$>$200K",
    dummy_newscheck_often = "Often check news",
    dummy_accountcheck_often = "Often check account",
    dummy_discussion_often = "Often discuss",
    dummy_num_wechat = "Many WeChat groups",
    agree = "Agreeableness",
    consc = "Conscientiousness",
    extrv = "Extraversion",
    neuro = "Neuroticism",
    opene = "Openness"
)
stopifnot(all(summ$var %in% names(row_labels)))
summ[, var := unname(row_labels[var])]

# The p-value is only needed for the stars; drop it before printing.
summ[, pval := NULL]
print(xtable(summ), include.rownames = FALSE, sanitize.text.function = identity)
